/* -*- c -*- */

/*
 * This file is for the definitions of simd vectorized operations.
 *
 * Currently contains sse2 functions that are built on amd64, x32 or
 * non-generic builds (CFLAGS=-march=...)
 * In future it may contain other instruction sets like AVX or NEON detected
 * at runtime in which case it needs to be included indirectly via a file
 * compiled with special options (or use gcc target attributes) so the binary
 * stays portable.
 */


#ifndef __NPY_SIMD_INC
#define __NPY_SIMD_INC

#include "lowlevel_strided_loops.h"
#include "numpy/npy_common.h"
#include "numpy/npy_math.h"
#ifdef NPY_HAVE_SSE2_INTRINSICS
#include <emmintrin.h>
#if !defined(_MSC_VER) || _MSC_VER >= 1600
#include <immintrin.h>
#else
#undef __AVX2__
#undef __AVX512F__
#endif
#endif
#include <assert.h>
#include <stdlib.h>
#include <float.h>
#include <string.h> /* for memcpy */

#define VECTOR_SIZE_BYTES 16

/*
 * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc.
 * Very large step size can be as slow as processing it using scalar. The
 * value of 2097152 ( = 2MB) was chosen using 2 considerations:
 * 1) Typical linux kernel page size is 4Kb, but sometimes it could also be 2MB
 *    which is == 2097152 Bytes. For a step size as large as this, surely all
 *    the loads/stores of gather/scatter instructions falls on 16 different pages
 *    which one would think would slow down gather/scatter instructions.
 * 2) It additionally satisfies MAX_STEP_SIZE*16/esize < NPY_MAX_INT32 which
 *    allows us to use i32 version of gather/scatter (as opposed to the i64 version)
 *    without problems (step larger than NPY_MAX_INT32*esize/16 would require use of
 *    i64gather/scatter). esize = element size = 4/8 bytes for float/double.
 */
#define MAX_STEP_SIZE 2097152

static NPY_INLINE npy_uintp
abs_ptrdiff(char *a, char *b)
{
    return (a > b) ? (a - b) : (b - a);
}

/*
 * nomemoverlap - returns true if two strided arrays have an overlapping
 * region in memory. ip_size/op_size = size of the arrays which can be negative
 * indicating negative steps.
 */
static NPY_INLINE npy_bool
nomemoverlap(char *ip,
             npy_intp ip_size,
             char *op,
             npy_intp op_size)
{
    char *ip_start, *ip_end, *op_start, *op_end;
    if (ip_size < 0) {
        ip_start = ip + ip_size;
        ip_end = ip;
    }
    else {
        ip_start = ip;
        ip_end = ip + ip_size;
    }
    if (op_size < 0) {
        op_start = op + op_size;
        op_end = op;
    }
    else {
        op_start = op;
        op_end = op + op_size;
    }
    return (ip_start > op_end) | (op_start > ip_end);
}

#define IS_BINARY_STRIDE_ONE(esize, vsize) \
    ((steps[0] == esize) && \
     (steps[1] == esize) && \
     (steps[2] == esize) && \
     (abs_ptrdiff(args[2], args[0]) >= vsize) && \
     (abs_ptrdiff(args[2], args[1]) >= vsize))

/*
 * stride is equal to element size and input and destination are equal or
 * don't overlap within one register. The check of the steps against
 * esize also quarantees that steps are >= 0.
 */
#define IS_BLOCKABLE_UNARY(esize, vsize) \
    (steps[0] == (esize) && steps[0] == steps[1] && \
     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
      ((abs_ptrdiff(args[1], args[0]) == 0))))

/*
 * Avoid using SIMD for very large step sizes for several reasons:
 * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
 *    in which case we need two i64gather instructions and an additional vinsertf32x8
 *    instruction to load a single zmm register (since one i64gather instruction
 *    loads into a ymm register). This is not ideal for performance.
 * 2) Gather and scatter instructions can be slow when the loads/stores
 *    cross page boundaries.
 *
 * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
 * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
 * ensures this. The condition also requires that the input and output arrays
 * should have no overlap in memory.
 */
#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
    ((abs(steps[0]) < MAX_STEP_SIZE)  && \
     (abs(steps[1]) < MAX_STEP_SIZE)  && \
     (abs(steps[2]) < MAX_STEP_SIZE)  && \
     (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
     (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))

/*
 * 1) Output should be contiguous, can handle strided input data
 * 2) Input step should be smaller than MAX_STEP_SIZE for performance
 * 3) Input and output arrays should have no overlap in memory
 */
#define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
    (steps[1] == (esize) && abs(steps[0]) < MAX_STEP_SIZE && \
     (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))

#define IS_BLOCKABLE_REDUCE(esize, vsize) \
    (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \
     npy_is_aligned(args[1], (esize)) && \
     npy_is_aligned(args[0], (esize)))

#define IS_BLOCKABLE_BINARY(esize, vsize) \
    (steps[0] == steps[1] && steps[1] == steps[2] && steps[2] == (esize) && \
     npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
     npy_is_aligned(args[0], (esize)) && \
     (abs_ptrdiff(args[2], args[0]) >= (vsize) || \
      abs_ptrdiff(args[2], args[0]) == 0) && \
     (abs_ptrdiff(args[2], args[1]) >= (vsize) || \
      abs_ptrdiff(args[2], args[1]) >= 0))

#define IS_BLOCKABLE_BINARY_SCALAR1(esize, vsize) \
    (steps[0] == 0 && steps[1] == steps[2] && steps[2] == (esize) && \
     npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[1], (esize)) && \
     ((abs_ptrdiff(args[2], args[1]) >= (vsize)) || \
      (abs_ptrdiff(args[2], args[1]) == 0)) && \
     abs_ptrdiff(args[2], args[0]) >= (esize))

#define IS_BLOCKABLE_BINARY_SCALAR2(esize, vsize) \
    (steps[1] == 0 && steps[0] == steps[2] && steps[2] == (esize) && \
     npy_is_aligned(args[2], (esize)) && npy_is_aligned(args[0], (esize)) && \
     ((abs_ptrdiff(args[2], args[0]) >= (vsize)) || \
      (abs_ptrdiff(args[2], args[0]) == 0)) && \
     abs_ptrdiff(args[2], args[1]) >= (esize))

#undef abs_ptrdiff

#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \
    (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \
     npy_is_aligned(args[1], (esize)) && \
     npy_is_aligned(args[0], (esize)))

#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \
    (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \
     npy_is_aligned(args[1], (esize)))

#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \
    (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \
     npy_is_aligned(args[0], (esize)))

/* align var to alignment */
#define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\
    npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\
                                                alignment, n);\
    for(i = 0; i < peel; i++)

#define LOOP_BLOCKED(type, vsize)\
    for(; i < npy_blocked_end(peel, sizeof(type), vsize, n);\
            i += (vsize / sizeof(type)))

#define LOOP_BLOCKED_END\
    for (; i < n; i++)


/*
 * Dispatcher functions
 * decide whether the operation can be vectorized and run it
 * if it was run returns true and false if nothing was done
 */

/*
 *****************************************************************************
 **                           CMPLX DISPATCHERS
 *****************************************************************************
 */

/**begin repeat
 * #TYPE = CFLOAT, CDOUBLE#
 * #type= npy_float, npy_double#
 * #esize = 8, 16#
 */

/**begin repeat1
 *  #func = add, subtract, multiply#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps);
#endif

static NPY_INLINE int
run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
    if (IS_BINARY_STRIDE_ONE(@esize@, 64)) {
        AVX512F_@func@_@TYPE@(args, dimensions, steps);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

/**end repeat1**/

/**begin repeat1
 *  #func = square, absolute, conjugate#
 *  #outsize = 1, 2, 1#
 *  #max_stride = 2, 8, 8#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride);
#endif

static NPY_INLINE int
run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
    if ((IS_OUTPUT_BLOCKABLE_UNARY((npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) {
        AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

/**end repeat1**/
/**end repeat**/

/*
 *****************************************************************************
 **                           FLOAT DISPATCHERS
 *****************************************************************************
 */

/**begin repeat
 * #type = npy_float, npy_double, npy_longdouble#
 * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
 * #EXISTS = 1, 1, 0#
 */

/**begin repeat1
 *  #func = maximum, minimum#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
static NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
#endif

static NPY_INLINE int
run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
        AVX512F_@func@_@TYPE@(args, dimensions, steps);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}


/**end repeat1**/
/**end repeat**/

/**begin repeat
 * #ISA = FMA, AVX512F#
 * #isa = fma, avx512f#
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
 * #REGISTER_SIZE = 32, 64#
 */

/* prototypes */

/**begin repeat1
 * #type = npy_float, npy_double#
 * #TYPE = FLOAT, DOUBLE#
 */

/**begin repeat2
 *  #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc#
 */

#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE NPY_GCC_TARGET_@ISA@ void
@ISA@_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n, const npy_intp stride);
#endif

static NPY_INLINE int
run_unary_@isa@_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), @REGISTER_SIZE@)) {
        @ISA@_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

/**end repeat2**/
/**end repeat1**/

/**begin repeat1
 * #func = exp, log#
 */

#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp stride);
#endif

static NPY_INLINE int
run_unary_@isa@_@func@_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
        @ISA@_@func@_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

/**end repeat1**/

#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE void
@ISA@_sincos_FLOAT(npy_float *, npy_float *, const npy_intp n, const npy_intp steps, NPY_TRIG_OP);
#endif

static NPY_INLINE int
run_unary_@isa@_sincos_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps, NPY_TRIG_OP my_trig_op)
{
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), @REGISTER_SIZE@)) {
        @ISA@_sincos_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0], my_trig_op);
        return 1;
    }
    else
        return 0;
#endif
    return 0;
}

/**end repeat**/


/**begin repeat
 * Float types
 *  #type = npy_float, npy_double, npy_longdouble#
 *  #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
 *  #vector = 1, 1, 0#
 */

/**begin repeat1
 * #func = sqrt, absolute, negative, minimum, maximum#
 * #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2 #
 * #name = unary*3, unary_reduce*2#
 */

#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);

#endif

static NPY_INLINE int
run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
    if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) {
        sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}

/**end repeat1**/

/**begin repeat1
 * Arithmetic
 * # kind = add, subtract, multiply, divide#
 */

#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
                          npy_intp n);
static void
sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
                                  npy_intp n);
static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
                                  npy_intp n);

#endif

static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
    @type@ * ip1 = (@type@ *)args[0];
    @type@ * ip2 = (@type@ *)args[1];
    @type@ * op = (@type@ *)args[2];
    npy_intp n = dimensions[0];
#if defined __AVX512F__
    const npy_uintp vector_size_bytes = 64;
#elif defined __AVX2__
    const npy_uintp vector_size_bytes = 32;
#else
    const npy_uintp vector_size_bytes = 32;
#endif
    /* argument one scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
        sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    /* argument two scalar */
    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
        sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
        sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}

/**end repeat1**/

/**begin repeat1
 * #kind = equal, not_equal, less, less_equal, greater, greater_equal,
 *         logical_and, logical_or#
 * #simd = 1, 1, 1, 1, 1, 1, 0, 0#
 */

#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS

/* prototypes */
static void
sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
                          npy_intp n);
static void
sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
                                  npy_intp n);
static void
sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2,
                                  npy_intp n);

#endif

static NPY_INLINE int
run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @vector@ && @simd@ && defined NPY_HAVE_SSE2_INTRINSICS
    @type@ * ip1 = (@type@ *)args[0];
    @type@ * ip2 = (@type@ *)args[1];
    npy_bool * op = (npy_bool *)args[2];
    npy_intp n = dimensions[0];
    /* argument one scalar */
    if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
        sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    /* argument two scalar */
    else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
        sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
    else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), VECTOR_SIZE_BYTES)) {
        sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
        return 1;
    }
#endif
    return 0;
}

/**end repeat1**/

/**begin repeat1
 * #kind = isnan, isfinite, isinf, signbit#
 */

#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS

static void
sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n);

#endif

static NPY_INLINE int
run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
    if (steps[0] == sizeof(@type@) && steps[1] == 1 &&
        npy_is_aligned(args[0], sizeof(@type@))) {
        sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}

/**end repeat1**/

/**end repeat**/

/*
 *****************************************************************************
 **                           BOOL DISPATCHERS
 *****************************************************************************
 */

/**begin repeat
 * # kind = logical_or, logical_and#
 */

#if defined NPY_HAVE_SSE2_INTRINSICS
static void
sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
                        npy_intp n);

static void
sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
#endif

static NPY_INLINE int
run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
        sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
                               (npy_bool*)args[1], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}


static NPY_INLINE int
run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
        sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
                                dimensions[0]);
        return 1;
    }
#endif
    return 0;
}

/**end repeat**/

/**begin repeat
 * # kind = absolute, logical_not#
 */

#if defined NPY_HAVE_SSE2_INTRINSICS
static void
sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
#endif

static NPY_INLINE int
run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
#if defined NPY_HAVE_SSE2_INTRINSICS
    if (sizeof(npy_bool) == 1 &&
            IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
        sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
        return 1;
    }
#endif
    return 0;
}

/**end repeat**/

#ifdef NPY_HAVE_SSE2_INTRINSICS

/*
 * Vectorized operations
 */
/*
 *****************************************************************************
 **                           FLOAT LOOPS
 *****************************************************************************
 */

/**begin repeat
* horizontal reductions on a vector
* # VOP = min, max#
*/

static NPY_INLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v)
{
    npy_float r;
    __m128 tmp = _mm_movehl_ps(v, v);                   /* c     d     ... */
    __m128 m = _mm_@VOP@_ps(v, tmp);                    /* m(ac) m(bd) ... */
    tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */
    _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m));             /* m(acbd) ... */
    return r;
}

static NPY_INLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v)
{
    npy_double r;
    __m128d tmp = _mm_unpackhi_pd(v, v);    /* b     b */
    _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */
    return r;
}

/**end repeat**/

/**begin repeat
 *  #type = npy_float, npy_double#
 *  #TYPE = FLOAT, DOUBLE#
 *  #scalarf = npy_sqrtf, npy_sqrt#
 *  #c = f, #
 *  #vtype = __m128, __m128d#
 *  #vtype256 = __m256, __m256d#
 *  #vtype512 = __m512, __m512d#
 *  #vpre = _mm, _mm#
 *  #vpre256 = _mm256, _mm256#
 *  #vpre512 = _mm512, _mm512#
 *  #vsuf = ps, pd#
 *  #vsufs = ss, sd#
 *  #nan = NPY_NANF, NPY_NAN#
 *  #double = 0, 1#
 *  #cast = _mm_castps_si128, _mm_castpd_si128#
 */


/**begin repeat1
* Arithmetic
* # kind = add, subtract, multiply, divide#
* # OP = +, -, *, /#
* # VOP = add, sub, mul, div#
*/

static void
sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef  __AVX512F__
    const npy_intp vector_size_bytes = 64;
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[i];
    /* lots of specializations, to squeeze out max performance */
    if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
    }
    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
            @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
            @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
                @vpre512@_store_@vsuf@(&op[i], c);
            }
        }
    }
#elif __AVX2__
    const npy_intp vector_size_bytes = 32;
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[i];
    /* lots of specializations, to squeeze out max performance */
    if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
            npy_is_aligned(&ip2[i], vector_size_bytes)) {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
    }
    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
            @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
            @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, vector_size_bytes) {
                @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
                @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
                @vpre256@_store_@vsuf@(&op[i], c);
            }
        }
    }
#else
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
        op[i] = ip1[i] @OP@ ip2[i];
    /* lots of specializations, to squeeze out max performance */
    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) &&
            npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
                @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
                @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
                @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
    }
    else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
            @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
            @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        if (ip1 == ip2) {
            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
                @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
        else {
            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
                @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
                @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
                @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
                @vpre@_store_@vsuf@(&op[i], c);
            }
        }
    }
#endif
    LOOP_BLOCKED_END {
        op[i] = ip1[i] @OP@ ip2[i];
    }
}


static void
sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __AVX512F__
    const npy_intp vector_size_bytes = 64;
    const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[0] @OP@ ip2[i];
    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }


#elif __AVX2__
    const npy_intp vector_size_bytes = 32;
    const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[0] @OP@ ip2[i];
    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
#else
    const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
        op[i] = ip1[0] @OP@ ip2[i];
    if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
#endif
    LOOP_BLOCKED_END {
        op[i] = ip1[0] @OP@ ip2[i];
    }
}


static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
#ifdef __AVX512F__
    const npy_intp vector_size_bytes = 64;
    const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[0];
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
            @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
            @vpre512@_store_@vsuf@(&op[i], c);
        }
    }

#elif __AVX2__
    const npy_intp vector_size_bytes = 32;
    const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
        op[i] = ip1[i] @OP@ ip2[0];
    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, vector_size_bytes) {
            @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
            @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
            @vpre256@_store_@vsuf@(&op[i], c);
        }
    }
#else
    const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
        op[i] = ip1[i] @OP@ ip2[0];
    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]);
            @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b);
            @vpre@_store_@vsuf@(&op[i], c);
        }
    }
#endif
    LOOP_BLOCKED_END {
        op[i] = ip1[i] @OP@ ip2[0];
    }
}

/**end repeat1**/

/*
 * compress 4 vectors to 4/8 bytes in op with filled with 0 or 1
 * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the
 * calling convention leading to C2719 on 32 bit, see #4795
 */
static NPY_INLINE void
sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4,
                              npy_bool * op)
{
    const __m128i mask = @vpre@_set1_epi8(0x1);
    __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2));
    __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4));
    __m128i rr = @vpre@_packs_epi16(ir1, ir2);
#if @double@
    rr = @vpre@_packs_epi16(rr, rr);
    rr = @vpre@_and_si128(rr, mask);
    @vpre@_storel_epi64((__m128i*)op, rr);
#else
    rr = @vpre@_and_si128(rr, mask);
    @vpre@_storeu_si128((__m128i*)op, rr);
#endif
}

static void
sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
        op[i] = npy_signbit(ip1[i]) != 0;
    }
    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]);
        int r = @vpre@_movemask_@vsuf@(a);
        if (sizeof(@type@) == 8) {
            op[i] = r & 1;
            op[i + 1] = (r >> 1);
        }
        else {
            op[i] = r & 1;
            op[i + 1] = (r >> 1) & 1;
            op[i + 2] = (r >> 2) & 1;
            op[i + 3] = (r >> 3);
        }
    }
    LOOP_BLOCKED_END {
        op[i] = npy_signbit(ip1[i]) != 0;
    }
}

/**begin repeat1
 * #kind = isnan, isfinite, isinf#
 * #var = 0, 1, 2#
 */

static void
sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
{
#if @var@ != 0 /* isinf/isfinite */
    /* signbit mask 0x7FFFFFFF after andnot */
    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
    const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(),
                                             @vpre@_setzero_@vsuf@());
#if @double@
    const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX);
#else
    const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX);
#endif
#endif
    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
        op[i] = npy_@kind@(ip1[i]) != 0;
    }
    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ r1, r2, r3, r4;
#if @var@ != 0 /* isinf/isfinite */
        /* fabs via masking of sign bit */
        r1 = @vpre@_andnot_@vsuf@(mask, a);
        r2 = @vpre@_andnot_@vsuf@(mask, b);
        r3 = @vpre@_andnot_@vsuf@(mask, c);
        r4 = @vpre@_andnot_@vsuf@(mask, d);
#if @var@ == 1 /* isfinite */
        /* negative compare against max float, nan is always true */
        r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax);
        r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax);
        r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax);
        r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax);
#else /* isinf */
        r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1);
        r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2);
        r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3);
        r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4);
#endif
        /* flip results to what we want (andnot as there is no sse not) */
        r1 = @vpre@_andnot_@vsuf@(r1, ones);
        r2 = @vpre@_andnot_@vsuf@(r2, ones);
        r3 = @vpre@_andnot_@vsuf@(r3, ones);
        r4 = @vpre@_andnot_@vsuf@(r4, ones);
#endif
#if @var@ == 0 /* isnan */
        r1 = @vpre@_cmpneq_@vsuf@(a, a);
        r2 = @vpre@_cmpneq_@vsuf@(b, b);
        r3 = @vpre@_cmpneq_@vsuf@(c, c);
        r4 = @vpre@_cmpneq_@vsuf@(d, d);
#endif
        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
    }
    LOOP_BLOCKED_END {
        op[i] = npy_@kind@(ip1[i]) != 0;
    }
}

/**end repeat1**/

/**begin repeat1
 * #kind = equal, not_equal, less, less_equal, greater, greater_equal#
 * #OP = ==, !=, <, <=, >, >=#
 * #VOP = cmpeq, cmpneq, cmplt, cmple, cmpgt, cmpge#
*/

/* sets invalid fpu flag on QNaN for consistency with packed compare */
static NPY_INLINE int
sse2_ordered_cmp_@kind@_@TYPE@(const @type@ a, const @type@ b)
{
    @vtype@ one = @vpre@_set1_@vsuf@(1);
    @type@ tmp;
    @vtype@ v = @vpre@_@VOP@_@vsufs@(@vpre@_load_@vsufs@(&a),
                                     @vpre@_load_@vsufs@(&b));
    v = @vpre@_and_@vsuf@(v, one);
    @vpre@_store_@vsufs@(&tmp, v);
    return tmp;
}

static void
sse2_binary_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
    }
    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
        @vtype@ a1 = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ b1 = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ c1 = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ d1 = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ a2 = @vpre@_loadu_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ b2 = @vpre@_loadu_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ c2 = @vpre@_loadu_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ d2 = @vpre@_loadu_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a1, a2);
        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b1, b2);
        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c1, c2);
        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d1, d2);
        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
    }
    LOOP_BLOCKED_END {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[i]);
    }
}


static void
sse2_binary_scalar1_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    @vtype@ s = @vpre@_set1_@vsuf@(ip1[0]);
    LOOP_BLOCK_ALIGN_VAR(ip2, @type@, VECTOR_SIZE_BYTES) {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
    }
    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
        @vtype@ a = @vpre@_load_@vsuf@(&ip2[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ b = @vpre@_load_@vsuf@(&ip2[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ c = @vpre@_load_@vsuf@(&ip2[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ d = @vpre@_load_@vsuf@(&ip2[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(s, a);
        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(s, b);
        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(s, c);
        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(s, d);
        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
    }
    LOOP_BLOCKED_END {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[0], ip2[i]);
    }
}


static void
sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
    @vtype@ s = @vpre@_set1_@vsuf@(ip2[0]);
    LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
    }
    LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) {
        @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ c = @vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]);
        @vtype@ r1 = @vpre@_@VOP@_@vsuf@(a, s);
        @vtype@ r2 = @vpre@_@VOP@_@vsuf@(b, s);
        @vtype@ r3 = @vpre@_@VOP@_@vsuf@(c, s);
        @vtype@ r4 = @vpre@_@VOP@_@vsuf@(d, s);
        sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]);
    }
    LOOP_BLOCKED_END {
        op[i] = sse2_ordered_cmp_@kind@_@TYPE@(ip1[i], ip2[0]);
    }
}
/**end repeat1**/

static void
sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
    /* align output to VECTOR_SIZE_BYTES bytes */
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
        op[i] = @scalarf@(ip[i]);
    }
    assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
           npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
    if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
        }
    }
    else {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
            @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = @scalarf@(ip[i]);
    }
}


static NPY_INLINE
@type@ scalar_abs_@type@(@type@ v)
{
    /* add 0 to clear -0.0 */
    return (v > 0 ? v: -v) + 0;
}

static NPY_INLINE
@type@ scalar_neg_@type@(@type@ v)
{
    return -v;
}

/**begin repeat1
 * #kind = absolute, negative#
 * #VOP = andnot, xor#
 * #scalar = scalar_abs, scalar_neg#
 **/
static void
sse2_@kind@_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
{
    /*
     * get 0x7FFFFFFF mask (everything but signbit set)
     * float & ~mask will remove the sign, float ^ mask flips the sign
     * this is equivalent to how the compiler implements fabs on amd64
     */
    const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);

    /* align output to VECTOR_SIZE_BYTES bytes */
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
        op[i] = @scalar@_@type@(ip[i]);
    }
    assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
           npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
    if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
            @vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a));
        }
    }
    else {
        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
            @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
            @vpre@_store_@vsuf@(&op[i], @vpre@_@VOP@_@vsuf@(mask, a));
        }
    }
    LOOP_BLOCKED_END {
        op[i] = @scalar@_@type@(ip[i]);
    }
}
/**end repeat1**/


/**begin repeat1
 * #kind = maximum, minimum#
 * #VOP = max, min#
 * #OP = >=, <=#
 **/
/* arguments swapped as unary reduce has the swapped compared to unary */
static void
sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
{
    const npy_intp stride = VECTOR_SIZE_BYTES / (npy_intp)sizeof(@type@);
    LOOP_BLOCK_ALIGN_VAR(ip, @type@, VECTOR_SIZE_BYTES) {
        /* Order of operations important for MSVC 2015 */
        *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
    }
    assert(n < stride || npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES));
    if (i + 3 * stride <= n) {
        /* load the first elements */
        @vtype@ c1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
        @vtype@ c2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
        i += 2 * stride;

        /* minps/minpd will set invalid flag if nan is encountered */
        npy_clear_floatstatus_barrier((char*)&c1);
        LOOP_BLOCKED(@type@, 2 * VECTOR_SIZE_BYTES) {
            @vtype@ v1 = @vpre@_load_@vsuf@((@type@*)&ip[i]);
            @vtype@ v2 = @vpre@_load_@vsuf@((@type@*)&ip[i + stride]);
            c1 = @vpre@_@VOP@_@vsuf@(c1, v1);
            c2 = @vpre@_@VOP@_@vsuf@(c2, v2);
        }
        c1 = @vpre@_@VOP@_@vsuf@(c1, c2);

        if (npy_get_floatstatus_barrier((char*)&c1) & NPY_FPE_INVALID) {
            *op = @nan@;
        }
        else {
            @type@ tmp = sse2_horizontal_@VOP@_@vtype@(c1);
            /* Order of operations important for MSVC 2015 */
            *op  = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp;
        }
    }
    LOOP_BLOCKED_END {
        /* Order of operations important for MSVC 2015 */
        *op  = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
    }
    npy_clear_floatstatus_barrier((char*)op);
}
/**end repeat1**/

/**end repeat**/

/* bunch of helper functions used in ISA_exp/log_FLOAT*/

#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_full_load_mask_ps(void)
{
    return _mm256_set1_ps(-1.0);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_get_full_load_mask_pd(void)
{
    return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
{
    float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
                            1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
    float* addr = maskint + num_lanes - num_elem;
    return _mm256_loadu_ps(addr);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
{
    npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
    npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
    return _mm256_loadu_si256((__m256i*) addr);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_masked_gather_ps(__m256 src,
                     npy_float* addr,
                     __m256i vindex,
                     __m256 mask)
{
    return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_masked_gather_pd(__m256d src,
                     npy_double* addr,
                     __m128i vindex,
                     __m256d mask)
{
    return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_masked_load_ps(__m256 mask, npy_float* addr)
{
    return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_masked_load_pd(__m256i mask, npy_double* addr)
{
    return _mm256_maskload_pd(addr, mask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
{
    return _mm256_blendv_ps(x, val, mask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
{
    return _mm256_blendv_pd(x, val, mask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_blend(__m256 x, __m256 y, __m256 ymask)
{
    return _mm256_blendv_ps(x, y, ymask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_invert_mask_ps(__m256 ymask)
{
    return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i
fma_invert_mask_pd(__m256i ymask)
{
    return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_should_calculate_sine(__m256i k, __m256i andop, __m256i cmp)
{
   return _mm256_cvtepi32_ps(
                _mm256_cmpeq_epi32(_mm256_and_si256(k, andop), cmp));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_should_negate(__m256i k, __m256i andop, __m256i cmp)
{
    return fma_should_calculate_sine(k, andop, cmp);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_exponent(__m256 x)
{
    /*
     * Special handling of denormals:
     * 1) Multiply denormal elements with 2**100 (0x71800000)
     * 2) Get the 8 bits of unbiased exponent
     * 3) Subtract 100 from exponent of denormals
     */

    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);

    __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask);
    __m256 temp = _mm256_mul_ps(temp1, two_power_100);
    x = _mm256_blendv_ps(x, temp, denormal_mask);

    __m256 exp = _mm256_cvtepi32_ps(
                    _mm256_sub_epi32(
                        _mm256_srli_epi32(
                            _mm256_castps_si256(x), 23),_mm256_set1_epi32(0x7E)));

    __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f));
    return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256
fma_get_mantissa(__m256 x)
{
    /*
     * Special handling of denormals:
     * 1) Multiply denormal elements with 2**100 (0x71800000)
     * 2) Get the 23 bits of mantissa
     * 3) Mantissa for denormals is not affected by the multiplication
     */

    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);

    __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask);
    __m256 temp = _mm256_mul_ps(temp1, two_power_100);
    x = _mm256_blendv_ps(x, temp, denormal_mask);

    __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff);
    __m256i exp_126_bits  = _mm256_set1_epi32(126 << 23);
    return _mm256_castsi256_ps(
                _mm256_or_si256(
                    _mm256_and_si256(
                        _mm256_castps_si256(x), mantissa_bits), exp_126_bits));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX2 __m256
fma_scalef_ps(__m256 poly, __m256 quadrant)
{
    /*
     * Handle denormals (which occur when quadrant <= -125):
     * 1) This function computes poly*(2^quad) by adding the exponent of
     poly to quad
     * 2) When quad <= -125, the output is a denormal and the above logic
     breaks down
     * 3) To handle such cases, we split quadrant: -125 + (quadrant + 125)
     * 4) poly*(2^-125) is computed the usual way
     * 5) 2^(quad-125) can be computed by: 2 << abs(quad-125)
     * 6) The final div operation generates the denormal
     */
     __m256 minquadrant = _mm256_set1_ps(-125.0f);
     __m256 denormal_mask = _mm256_cmp_ps(quadrant, minquadrant, _CMP_LE_OQ);
     if (_mm256_movemask_ps(denormal_mask) != 0x0000) {
        __m256 quad_diff = _mm256_sub_ps(quadrant, minquadrant);
        quad_diff = _mm256_sub_ps(_mm256_setzero_ps(), quad_diff);
        quad_diff = _mm256_blendv_ps(_mm256_setzero_ps(), quad_diff, denormal_mask);
        __m256i two_power_diff = _mm256_sllv_epi32(
                                   _mm256_set1_epi32(1), _mm256_cvtps_epi32(quad_diff));
        quadrant = _mm256_max_ps(quadrant, minquadrant); //keep quadrant >= -126
        __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23);
        poly = _mm256_castsi256_ps(
                   _mm256_add_epi32(
                       _mm256_castps_si256(poly), exponent));
        __m256 denorm_poly = _mm256_div_ps(poly, _mm256_cvtepi32_ps(two_power_diff));
        return _mm256_blendv_ps(poly, denorm_poly, denormal_mask);
     }
     else {
        __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23);
        poly = _mm256_castsi256_ps(
                   _mm256_add_epi32(
                       _mm256_castps_si256(poly), exponent));
        return poly;
     }
}

/**begin repeat
 *  #vsub = ps, pd#
 *  #vtype = __m256, __m256d#
 */
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_abs_@vsub@(@vtype@ x)
{
    return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_reciprocal_@vsub@(@vtype@ x)
{
    return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_rint_@vsub@(@vtype@ x)
{
    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_floor_@vsub@(@vtype@ x)
{
    return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_ceil_@vsub@(@vtype@ x)
{
    return _mm256_round_@vsub@(x, _MM_FROUND_TO_POS_INF);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
fma_trunc_@vsub@(@vtype@ x)
{
    return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
}
/**end repeat**/
#endif

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_full_load_mask_ps(void)
{
    return 0xFFFF;
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_get_full_load_mask_pd(void)
{
    return 0xFF;
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
{
    return (0x0001 << num_elem) - 0x0001;
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
{
    return (0x01 << num_elem) - 0x01;
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_masked_gather_ps(__m512 src,
                        npy_float* addr,
                        __m512i vindex,
                        __mmask16 kmask)
{
    return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
avx512_masked_gather_pd(__m512d src,
                        npy_double* addr,
                        __m256i vindex,
                        __mmask8 kmask)
{
    return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
{
    return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
{
    return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
{
    return _mm512_mask_blend_ps(mask, x, val);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
{
    return _mm512_mask_blend_pd(mask, x, val);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
{
    return _mm512_mask_mov_ps(x, ymask, y);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_invert_mask_ps(__mmask16 ymask)
{
    return _mm512_knot(ymask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
avx512_invert_mask_pd(__mmask8 ymask)
{
    return _mm512_knot(ymask);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_should_calculate_sine(__m512i k, __m512i andop, __m512i cmp)
{
    return _mm512_cmpeq_epi32_mask(_mm512_and_epi32(k, andop), cmp);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
avx512_should_negate(__m512i k, __m512i andop, __m512i cmp)
{
    return avx512_should_calculate_sine(k, andop, cmp);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_get_exponent(__m512 x)
{
    return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_get_mantissa(__m512 x)
{
    return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512
avx512_scalef_ps(__m512 poly, __m512 quadrant)
{
    return _mm512_scalef_ps(poly, quadrant);
}
/**begin repeat
 *  #vsub  = ps, pd#
 *  #type= npy_float, npy_double#
 *  #epi_vsub  = epi32, epi64#
 *  #vtype = __m512, __m512d#
 *  #mask = __mmask16, __mmask8#
 *  #and_const = 0x7fffffff, 0x7fffffffffffffffLL#
 *  #neg_mask = 0x80000000, 0x8000000000000000#
 *  #perm_ = 0xb1, 0x55#
 *  #cmpx_img_mask = 0xAAAA, 0xAA#
 *  #cmpx_re_mask = 0x5555, 0x55#
 *  #INF = NPY_INFINITYF, NPY_INFINITY#
 *  #NAN = NPY_NANF, NPY_NAN#
 */
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_abs_@vsub@(@vtype@ x)
{
    return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x,
				    _mm512_set1_@epi_vsub@ (@and_const@));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_reciprocal_@vsub@(@vtype@ x)
{
    return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_rint_@vsub@(@vtype@ x)
{
    return _mm512_roundscale_@vsub@(x, 0x08);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_floor_@vsub@(@vtype@ x)
{
    return _mm512_roundscale_@vsub@(x, 0x09);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_ceil_@vsub@(@vtype@ x)
{
    return _mm512_roundscale_@vsub@(x, 0x0A);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_trunc_@vsub@(@vtype@ x)
{
    return _mm512_roundscale_@vsub@(x, 0x0B);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_hadd_@vsub@(const @vtype@ x)
{
    return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_hsub_@vsub@(const @vtype@ x)
{
    return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_cabsolute_@vsub@(const @vtype@ x1,
                        const @vtype@ x2,
                        const __m512i re_indices,
                        const __m512i im_indices)
{
    @vtype@ inf = _mm512_set1_@vsub@(@INF@);
    @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
    @vtype@ x1_abs = avx512_abs_@vsub@(x1);
    @vtype@ x2_abs = avx512_abs_@vsub@(x2);
    @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs);
    @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs);
    /*
     * If real or imag = INF, then convert it to inf + j*inf
     * Handles: inf + j*nan, nan + j*inf
     */
    @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ);
    @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ);
    im = _mm512_mask_mov_@vsub@(im, re_infmask, inf);
    re = _mm512_mask_mov_@vsub@(re, im_infmask, inf);

    /*
     * If real or imag = NAN, then convert it to nan + j*nan
     * Handles: x + j*nan, nan + j*x
     */
    @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ);
    @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ);
    im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan);
    re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan);

    @vtype@ larger  = _mm512_max_@vsub@(re, im);
    @vtype@ smaller = _mm512_min_@vsub@(im, re);

    /*
     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
     */
    @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ);
    @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ);
    @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask));
    @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger);
    @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@(
                                        ratio, ratio, _mm512_set1_@vsub@(1.0f)));
    return _mm512_mul_@vsub@(hypot, larger);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_conjugate_@vsub@(const @vtype@ x)
{
    /*
     * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and
     * use the xor_epi32/64 uinstruction instead. Cast is a zero latency instruction
     */
    __m512i cast_x = _mm512_cast@vsub@_si512(x);
    __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@,
                                        cast_x, _mm512_set1_@epi_vsub@(@neg_mask@));
    return _mm512_castsi512_@vsub@(res);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
{
    // x1 = r1, i1
    // x2 = r2, i2
    @vtype@ x3  = _mm512_permute_@vsub@(x2, @perm_@);   // i2, r2
    @vtype@ x12 = _mm512_mul_@vsub@(x1, x2);            // r1*r2, i1*i2
    @vtype@ x13 = _mm512_mul_@vsub@(x1, x3);            // r1*i2, r2*i1
    @vtype@ outreal = avx512_hsub_@vsub@(x12);          // r1*r2 - i1*i2, r1*r2 - i1*i2
    @vtype@ outimg  = avx512_hadd_@vsub@(x13);          // r1*i2 + i1*r2, r1*i2 + i1*r2
    return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
avx512_csquare_@vsub@(@vtype@ x)
{
    return avx512_cmul_@vsub@(x, x);
}

/**end repeat**/
#endif

/**begin repeat
 * #ISA = FMA, AVX512F#
 * #isa = fma, avx512#
 * #vtype = __m256, __m512#
 * #vsize = 256, 512#
 * #or = or_ps, kor#
 * #vsub = , _mask#
 * #mask = __m256, __mmask16#
 * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
 **/

#if defined @CHK@

/*
 * Vectorized Cody-Waite range reduction technique
 * Performs the reduction step x* = x - y*C in three steps:
 * 1) x* = x - y*c1
 * 2) x* = x - y*c2
 * 3) x* = x - y*c3
 * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
 */

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
{
    @vtype@ reduced_x = @fmadd@(y, c1, x);
    reduced_x = @fmadd@(y, c2, reduced_x);
    reduced_x = @fmadd@(y, c3, reduced_x);
    return reduced_x;
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @mask@
@isa@_in_range_mask(@vtype@ x, npy_float fmax, npy_float fmin)
{
    @mask@ m1 = _mm@vsize@_cmp_ps@vsub@(
                                x, _mm@vsize@_set1_ps(fmax), _CMP_GT_OQ);
    @mask@ m2 = _mm@vsize@_cmp_ps@vsub@(
                                x, _mm@vsize@_set1_ps(fmin), _CMP_LT_OQ);
    return _mm@vsize@_@or@(m1,m2);
}

/*
 * Approximate cosine algorithm for x \in [-PI/4, PI/4]
 * Maximum ULP across all 32-bit floats = 0.875
 */

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_cosine(@vtype@ x2, @vtype@ invf8, @vtype@ invf6, @vtype@ invf4,
                                                @vtype@ invf2, @vtype@ invf0)
{
    @vtype@ cos = @fmadd@(invf8, x2, invf6);
    cos = @fmadd@(cos, x2, invf4);
    cos = @fmadd@(cos, x2, invf2);
    cos = @fmadd@(cos, x2, invf0);
    return cos;
}

/*
 * Approximate sine algorithm for x \in [-PI/4, PI/4]
 * Maximum ULP across all 32-bit floats = 0.647
 */

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_sine(@vtype@ x, @vtype@ x2, @vtype@ invf9, @vtype@ invf7,
                                          @vtype@ invf5, @vtype@ invf3,
                                          @vtype@ zero)
{
    @vtype@ sin = @fmadd@(invf9, x2, invf7);
    sin = @fmadd@(sin, x2, invf5);
    sin = @fmadd@(sin, x2, invf3);
    sin = @fmadd@(sin, x2, zero);
    sin = @fmadd@(sin, x, x);
    return sin;
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_sqrt_ps(@vtype@ x)
{
    return _mm@vsize@_sqrt_ps(x);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
@isa@_sqrt_pd(@vtype@d x)
{
    return _mm@vsize@_sqrt_pd(x);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
@isa@_square_ps(@vtype@ x)
{
    return _mm@vsize@_mul_ps(x,x);
}

static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
@isa@_square_pd(@vtype@d x)
{
    return _mm@vsize@_mul_pd(x,x);
}

#endif
/**end repeat**/

/**begin repeat
 * #type = npy_float, npy_double#
 * #TYPE = FLOAT, DOUBLE#
 * #num_lanes = 16, 8#
 * #vsuffix = ps, pd#
 * #mask = __mmask16, __mmask8#
 * #vtype = __m512, __m512d#
 * #scale = 4, 8#
 * #vindextype = __m512i, __m256i#
 * #vindexsize = 512, 256#
 * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
 */

/**begin repeat1
 *  #func = maximum, minimum#
 *  #vectorf = max, min#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(@type@);
    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
    const npy_intp array_size = dimensions[0];
    npy_intp num_remaining_elements = array_size;
    @type@* ip1 = (@type@*) args[0];
    @type@* ip2 = (@type@*) args[1];
    @type@* op  = (@type@*) args[2];

    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
     */

    npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
        index_ip1[ii] = ii*stride_ip1;
        index_ip2[ii] = ii*stride_ip2;
        index_op[ii] = ii*stride_op;
    }
    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
    @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
    @vindextype@ vindex_op  = @vindexload@((@vindextype@*)&index_op[0]);
    @vtype@ zeros_f = _mm512_setzero_@vsuffix@();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < @num_lanes@) {
            load_mask = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements, @num_lanes@);
        }
        @vtype@ x1, x2;
        if (stride_ip1 == 1) {
            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
        }
        else {
            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
        }
        if (stride_ip2 == 1) {
            x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
        }
        else {
            x2 = avx512_masked_gather_@vsuffix@(zeros_f, ip2, vindex_ip2, load_mask);
        }

        /*
         * when only one of the argument is a nan, the maxps/maxpd instruction
         * returns the second argument. The additional blend instruction fixes
         * this issue to conform with NumPy behaviour.
         */
        @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
        @vtype@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
        out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);

        if (stride_op == 1) {
            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
        }
        else {
            /* scatter! */
            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
        }

        ip1 += @num_lanes@*stride_ip1;
        ip2 += @num_lanes@*stride_ip2;
        op += @num_lanes@*stride_op;
        num_remaining_elements -= @num_lanes@;
    }
}
#endif
/**end repeat**/
/**end repeat1**/

/**begin repeat
 * #ISA = FMA, AVX512F#
 * #isa = fma, avx512#
 * #vsize = 256, 512#
 * #BYTES = 32, 64#
 * #cvtps_epi32 = _mm256_cvtps_epi32, #
 * #mask = __m256, __mmask16#
 * #vsub = , _mask#
 * #vtype = __m256, __m512#
 * #cvtps_epi32 = _mm256_cvtps_epi32, #
 * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
 */

/**begin repeat1
 *  #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
 *  #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
 *  #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
 */

#if defined @CHK@
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_@func@_FLOAT(npy_float* op,
                   npy_float* ip,
                   const npy_intp array_size,
                   const npy_intp steps)
{
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
    npy_intp num_remaining_elements = array_size;
    @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
    @mask@ load_mask = @isa@_get_full_load_mask_ps();
#if @replace_0_with_1@
    @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
#endif

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_OUTPUT_BLOCKABLE_UNARY
     */

    npy_int32 indexarr[16];
    for (npy_int32 ii = 0; ii < 16; ii++) {
        indexarr[ii] = ii*stride;
    }
    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < num_lanes) {
            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
                                                       num_lanes);
#if @replace_0_with_1@
            inv_load_mask = @isa@_invert_mask_ps(load_mask);
#endif
        }
        @vtype@ x;
        if (stride == 1) {
            x = @isa@_masked_load_ps(load_mask, ip);
#if @replace_0_with_1@
            /*
             * Replace masked elements with 1.0f to avoid divide by zero fp
             * exception in reciprocal
             */
            x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
#endif
        }
        else {
            x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
        }
        @vtype@ out = @isa@_@vectorf@_ps(x);
        @masked_store@(op, @cvtps_epi32@(load_mask), out);

        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }
}
#endif
/**end repeat1**/
/**end repeat**/

/**begin repeat
 * #ISA = FMA, AVX512F#
 * #isa = fma, avx512#
 * #vsize = 256, 512#
 * #BYTES = 32, 64#
 * #cvtps_epi32 = _mm256_cvtps_epi32, #
 * #mask = __m256i, __mmask8#
 * #vsub = , _mask#
 * #vtype = __m256d, __m512d#
 * #vindextype = __m128i, __m256i#
 * #vindexsize = 128, 256#
 * #vindexload = _mm_loadu_si128, _mm256_loadu_si256#
 * #cvtps_epi32 = _mm256_cvtpd_epi32, #
 * #castmask = _mm256_castsi256_pd, #
 * #masked_store = _mm256_maskstore_pd, _mm512_mask_storeu_pd#
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
 */

/**begin repeat1
 *  #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
 *  #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
 *  #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
 */

#if defined @CHK@
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_@func@_DOUBLE(npy_double* op,
                    npy_double* ip,
                    const npy_intp array_size,
                    const npy_intp steps)
{
    const npy_intp stride = steps/(npy_intp)sizeof(npy_double);
    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
    npy_intp num_remaining_elements = array_size;
    @mask@ load_mask = @isa@_get_full_load_mask_pd();
#if @replace_0_with_1@
    @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
#endif
    @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_OUTPUT_BLOCKABLE_UNARY
     */
    npy_int32 indexarr[8];
    for (npy_int32 ii = 0; ii < 8; ii++) {
        indexarr[ii] = ii*stride;
    }
    @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < num_lanes) {
            load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
                                                       num_lanes);
#if @replace_0_with_1@
            inv_load_mask = @isa@_invert_mask_pd(load_mask);
#endif
        }
        @vtype@ x;
        if (stride == 1) {
            x = @isa@_masked_load_pd(load_mask, ip);
#if @replace_0_with_1@
            /*
             * Replace masked elements with 1.0f to avoid divide by zero fp
             * exception in reciprocal
             */
            x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
#endif
        }
        else {
            x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
        }
        @vtype@ out = @isa@_@vectorf@_pd(x);
        @masked_store@(op, load_mask, out);

        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }
}
#endif
/**end repeat1**/
/**end repeat**/

/**begin repeat
 * #ISA = FMA, AVX512F#
 * #isa = fma, avx512#
 * #vtype = __m256, __m512#
 * #vsize = 256, 512#
 * #BYTES = 32, 64#
 * #mask = __m256, __mmask16#
 * #vsub = , _mask#
 * #or_masks =_mm256_or_ps, _mm512_kor#
 * #and_masks =_mm256_and_ps, _mm512_kand#
 * #xor_masks =_mm256_xor_ps, _mm512_kxor#
 * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps#
 * #mask_to_int = _mm256_movemask_ps, #
 * #full_mask= 0xFF, 0xFFFF#
 * #masked_store = _mm256_maskstore_ps, _mm512_mask_storeu_ps#
 * #cvtps_epi32 = _mm256_cvtps_epi32, #
 * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
 */

/*
 * Vectorized approximate sine/cosine algorithms: The following code is a
 * vectorized version of the algorithm presented here:
 * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
 * (1) Load data in ZMM/YMM registers and generate mask for elements that are
 * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
 * 117435.992f] for sine.
 * (2) For elements within range, perform range reduction using Cody-Waite's
 * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
 * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
 * int(y).
 * (4) For elements outside that range, Cody-Waite reduction performs poorly
 * leading to catastrophic cancellation. We compute cosine by calling glibc in
 * a scalar fashion.
 * (5) Vectorized implementation has a max ULP of 1.49 and performs at least
 * 5-7x faster than scalar implementations when magnitude of all elements in
 * the array < 71476.0625f (117435.992f for sine). Worst case performance is
 * when all the elements are large leading to about 1-2% reduction in
 * performance.
 */

#if defined @CHK@
static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_sincos_FLOAT(npy_float * op,
                   npy_float * ip,
                   const npy_intp array_size,
                   const npy_intp steps,
                   NPY_TRIG_OP my_trig_op)
{
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
    npy_float large_number = 71476.0625f;
    if (my_trig_op == npy_compute_sin) {
        large_number = 117435.992f;
    }

    /* Load up frequently used constants */
    @vtype@i zeros = _mm@vsize@_set1_epi32(0);
    @vtype@i ones = _mm@vsize@_set1_epi32(1);
    @vtype@i twos = _mm@vsize@_set1_epi32(2);
    @vtype@ two_over_pi = _mm@vsize@_set1_ps(NPY_TWO_O_PIf);
    @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_HIGHf);
    @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_MEDf);
    @vtype@ codyw_c3 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_PI_O_2_LOWf);
    @vtype@ cos_invf0 = _mm@vsize@_set1_ps(NPY_COEFF_INVF0_COSINEf);
    @vtype@ cos_invf2 = _mm@vsize@_set1_ps(NPY_COEFF_INVF2_COSINEf);
    @vtype@ cos_invf4 = _mm@vsize@_set1_ps(NPY_COEFF_INVF4_COSINEf);
    @vtype@ cos_invf6 = _mm@vsize@_set1_ps(NPY_COEFF_INVF6_COSINEf);
    @vtype@ cos_invf8 = _mm@vsize@_set1_ps(NPY_COEFF_INVF8_COSINEf);
    @vtype@ sin_invf3 = _mm@vsize@_set1_ps(NPY_COEFF_INVF3_SINEf);
    @vtype@ sin_invf5 = _mm@vsize@_set1_ps(NPY_COEFF_INVF5_SINEf);
    @vtype@ sin_invf7 = _mm@vsize@_set1_ps(NPY_COEFF_INVF7_SINEf);
    @vtype@ sin_invf9 = _mm@vsize@_set1_ps(NPY_COEFF_INVF9_SINEf);
    @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
    @vtype@ zero_f = _mm@vsize@_set1_ps(0.0f);
    @vtype@ quadrant, reduced_x, reduced_x2, cos, sin;
    @vtype@i iquadrant;
    @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
    @mask@ load_mask = @isa@_get_full_load_mask_ps();
    npy_intp num_remaining_elements = array_size;

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_OUTPUT_BLOCKABLE_UNARY
     */
    npy_int32 indexarr[16];
    for (npy_int32 ii = 0; ii < 16; ii++) {
        indexarr[ii] = ii*stride;
    }
    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);

    while (num_remaining_elements > 0) {

        if (num_remaining_elements < num_lanes) {
            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
                                                         num_lanes);
        }

        @vtype@ x;
        if (stride == 1) {
            x = @isa@_masked_load_ps(load_mask, ip);
        }
        else {
            x = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
        }

        /*
         * For elements outside of this range, Cody-Waite's range reduction
         * becomes inaccurate and we will call glibc to compute cosine for
         * these numbers
         */

        glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
        glibc_mask = @and_masks@(load_mask, glibc_mask);
        nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
        x = @isa@_set_masked_lanes_ps(x, zero_f, @or_masks@(nan_mask, glibc_mask));
        npy_int iglibc_mask = @mask_to_int@(glibc_mask);

        if (iglibc_mask != @full_mask@) {
            quadrant = _mm@vsize@_mul_ps(x, two_over_pi);

            /* round to nearest */
            quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
            quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);

            /* Cody-Waite's range reduction algorithm */
            reduced_x = @isa@_range_reduction(x, quadrant,
                                                   codyw_c1, codyw_c2, codyw_c3);
            reduced_x2 = _mm@vsize@_mul_ps(reduced_x, reduced_x);

            /* compute cosine and sine */
            cos = @isa@_cosine(reduced_x2, cos_invf8, cos_invf6, cos_invf4,
                                                           cos_invf2, cos_invf0);
            sin = @isa@_sine(reduced_x, reduced_x2, sin_invf9, sin_invf7,
                                             sin_invf5, sin_invf3, zero_f);

            iquadrant = _mm@vsize@_cvtps_epi32(quadrant);
            if (my_trig_op == npy_compute_cos) {
                iquadrant = _mm@vsize@_add_epi32(iquadrant, ones);
            }

            /* blend sin and cos based on the quadrant */
            sine_mask = @isa@_should_calculate_sine(iquadrant, ones, zeros);
            cos = @isa@_blend(cos, sin, sine_mask);

            /* multiply by -1 for appropriate elements */
            negate_mask = @isa@_should_negate(iquadrant, twos, twos);
            cos = @isa@_blend(cos, _mm@vsize@_sub_ps(zero_f, cos), negate_mask);
            cos = @isa@_set_masked_lanes_ps(cos, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);

            @masked_store@(op, @cvtps_epi32@(load_mask), cos);
        }

        /* process elements using glibc for large elements */
        if (my_trig_op == npy_compute_cos) {
            for (int ii = 0; iglibc_mask != 0; ii++) {
                if (iglibc_mask & 0x01) {
                    op[ii] = npy_cosf(ip[ii]);
                }
                iglibc_mask  = iglibc_mask >> 1;
            }
        }
        else {
            for (int ii = 0; iglibc_mask != 0; ii++) {
                if (iglibc_mask & 0x01) {
                    op[ii] = npy_sinf(ip[ii]);
                }
                iglibc_mask  = iglibc_mask >> 1;
            }
        }
        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }
}

/*
 * Vectorized implementation of exp using AVX2 and AVX512:
 * 1) if x >= xmax; return INF (overflow)
 * 2) if x <= xmin; return 0.0f (underflow)
 * 3) Range reduction (using Coyd-Waite):
 *      a) y = x - k*ln(2); k = rint(x/ln(2)); y \in [0, ln(2)]
 * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q
 *      b) P = 5th order and Q = 2nd order polynomials obtained from Remez's
 *      algorithm (mini-max polynomial approximation)
 * 5) Compute exp(x) = exp(y) * 2^k
 * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37)
 * 7) Max relative error measured across all 32-bit FP's= 2.1264E-07 (for the
 * same x = 0xc2781e37)
 */

static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_exp_FLOAT(npy_float * op,
                npy_float * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
    npy_float xmax = 88.72283935546875f;
    npy_float xmin = -103.97208404541015625f;

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_OUTPUT_BLOCKABLE_UNARY
     */
    npy_int32 indexarr[16];
    for (npy_int32 ii = 0; ii < 16; ii++) {
        indexarr[ii] = ii*stride;
    }

    /* Load up frequently used constants */
    @vtype@ codyw_c1 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
    @vtype@ codyw_c2 = _mm@vsize@_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf);
    @vtype@ exp_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_EXPf);
    @vtype@ exp_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_EXPf);
    @vtype@ exp_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_EXPf);
    @vtype@ exp_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_EXPf);
    @vtype@ exp_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_EXPf);
    @vtype@ exp_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_EXPf);
    @vtype@ exp_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_EXPf);
    @vtype@ exp_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_EXPf);
    @vtype@ exp_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_EXPf);
    @vtype@ cvt_magic = _mm@vsize@_set1_ps(NPY_RINT_CVT_MAGICf);
    @vtype@ log2e = _mm@vsize@_set1_ps(NPY_LOG2Ef);
    @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
    @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
    @vtype@ poly, num_poly, denom_poly, quadrant;
    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);

    @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
    @mask@ overflow_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
    @mask@ load_mask = @isa@_get_full_load_mask_ps();
    npy_intp num_remaining_elements = array_size;

    while (num_remaining_elements > 0) {

        if (num_remaining_elements < num_lanes) {
            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
                                                       num_lanes);
        }

        @vtype@ x;
        if (stride == 1) {
            x = @isa@_masked_load_ps(load_mask, ip);
        }
        else {
            x = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
        }

        nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
        x = @isa@_set_masked_lanes_ps(x, zeros_f, nan_mask);

        xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
        xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
        inf_mask = _mm@vsize@_cmp_ps@vsub@(x, inf, _CMP_EQ_OQ);
        overflow_mask = @or_masks@(overflow_mask,
                                    @xor_masks@(xmax_mask, inf_mask));

        x = @isa@_set_masked_lanes_ps(x, zeros_f, @or_masks@(
                                    @or_masks@(nan_mask, xmin_mask), xmax_mask));

        quadrant = _mm@vsize@_mul_ps(x, log2e);

        /* round to nearest */
        quadrant = _mm@vsize@_add_ps(quadrant, cvt_magic);
        quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);

        /* Cody-Waite's range reduction algorithm */
        x = @isa@_range_reduction(x, quadrant, codyw_c1, codyw_c2, zeros_f);

        num_poly = @fmadd@(exp_p5, x, exp_p4);
        num_poly = @fmadd@(num_poly, x, exp_p3);
        num_poly = @fmadd@(num_poly, x, exp_p2);
        num_poly = @fmadd@(num_poly, x, exp_p1);
        num_poly = @fmadd@(num_poly, x, exp_p0);
        denom_poly = @fmadd@(exp_q2, x, exp_q1);
        denom_poly = @fmadd@(denom_poly, x, exp_q0);
        poly = _mm@vsize@_div_ps(num_poly, denom_poly);

        /*
         * compute val = poly * 2^quadrant; which is same as adding the
         * exponent of quadrant to the exponent of poly. quadrant is an int,
         * so extracting exponent is simply extracting 8 bits.
         */
        poly = @isa@_scalef_ps(poly, quadrant);

        /*
         * elem > xmax; return inf
         * elem < xmin; return 0.0f
         * elem = +/- nan, return nan
         */
        poly = @isa@_set_masked_lanes_ps(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
        poly = @isa@_set_masked_lanes_ps(poly, inf, xmax_mask);
        poly = @isa@_set_masked_lanes_ps(poly, zeros_f, xmin_mask);

        @masked_store@(op, @cvtps_epi32@(load_mask), poly);

        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    if (@mask_to_int@(overflow_mask)) {
        npy_set_floatstatus_overflow();
    }
}

/*
 * Vectorized implementation of log using AVX2 and AVX512
 * 1) if x < 0.0f; return -NAN (invalid input)
 * 2) Range reduction: y = x/2^k;
 *      a) y = normalized mantissa, k is the exponent (0.5 <= y < 1)
 * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q
 *      b) P = 5th order and Q = 5th order polynomials obtained from Remez's
 *      algorithm (mini-max polynomial approximation)
 * 5) Compute log(x) = log(y) + k*ln(2)
 * 6) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945)
 * 7) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same
 * x = 0x3f486945)
 */

static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
@ISA@_log_FLOAT(npy_float * op,
                npy_float * ip,
                const npy_intp array_size,
                const npy_intp steps)
{
    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via
     * IS_OUTPUT_BLOCKABLE_UNARY
     */
    npy_int32 indexarr[16];
    for (npy_int32 ii = 0; ii < 16; ii++) {
        indexarr[ii] = ii*stride;
    }

    /* Load up frequently used constants */
    @vtype@ log_p0 = _mm@vsize@_set1_ps(NPY_COEFF_P0_LOGf);
    @vtype@ log_p1 = _mm@vsize@_set1_ps(NPY_COEFF_P1_LOGf);
    @vtype@ log_p2 = _mm@vsize@_set1_ps(NPY_COEFF_P2_LOGf);
    @vtype@ log_p3 = _mm@vsize@_set1_ps(NPY_COEFF_P3_LOGf);
    @vtype@ log_p4 = _mm@vsize@_set1_ps(NPY_COEFF_P4_LOGf);
    @vtype@ log_p5 = _mm@vsize@_set1_ps(NPY_COEFF_P5_LOGf);
    @vtype@ log_q0 = _mm@vsize@_set1_ps(NPY_COEFF_Q0_LOGf);
    @vtype@ log_q1 = _mm@vsize@_set1_ps(NPY_COEFF_Q1_LOGf);
    @vtype@ log_q2 = _mm@vsize@_set1_ps(NPY_COEFF_Q2_LOGf);
    @vtype@ log_q3 = _mm@vsize@_set1_ps(NPY_COEFF_Q3_LOGf);
    @vtype@ log_q4 = _mm@vsize@_set1_ps(NPY_COEFF_Q4_LOGf);
    @vtype@ log_q5 = _mm@vsize@_set1_ps(NPY_COEFF_Q5_LOGf);
    @vtype@ loge2 = _mm@vsize@_set1_ps(NPY_LOGE2f);
    @vtype@ nan = _mm@vsize@_set1_ps(NPY_NANF);
    @vtype@ neg_nan = _mm@vsize@_set1_ps(-NPY_NANF);
    @vtype@ neg_inf = _mm@vsize@_set1_ps(-NPY_INFINITYF);
    @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
    @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
    @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
    @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)indexarr);
    @vtype@ poly, num_poly, denom_poly, exponent;

    @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
    @mask@ invalid_mask = @isa@_get_partial_load_mask_ps(0, num_lanes);
    @mask@ divide_by_zero_mask = invalid_mask;
    @mask@ load_mask = @isa@_get_full_load_mask_ps();
    npy_intp num_remaining_elements = array_size;

    while (num_remaining_elements > 0) {

        if (num_remaining_elements < num_lanes) {
            load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
                                                       num_lanes);
        }

        @vtype@ x_in;
        if (stride == 1) {
            x_in = @isa@_masked_load_ps(load_mask, ip);
        }
        else {
            x_in  = @isa@_masked_gather_ps(zeros_f, ip, vindex, load_mask);
        }

        negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
        zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ);
        inf_mask = _mm@vsize@_cmp_ps@vsub@(x_in, inf, _CMP_EQ_OQ);
        nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ);
        divide_by_zero_mask = @or_masks@(divide_by_zero_mask,
                                        @and_masks@(zero_mask, load_mask));
        invalid_mask = @or_masks@(invalid_mask, negx_mask);

        @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zeros_f, negx_mask);

        /* set x = normalized mantissa */
        exponent = @isa@_get_exponent(x);
        x = @isa@_get_mantissa(x);

        /* if x < sqrt(2) {exp = exp-1; x = 2*x} */
        sqrt2_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ);
        x = @isa@_blend(x, _mm@vsize@_add_ps(x,x), sqrt2_mask);
        exponent = @isa@_blend(exponent,
                               _mm@vsize@_sub_ps(exponent,ones_f), sqrt2_mask);

        /* x = x - 1 */
        x = _mm@vsize@_sub_ps(x, ones_f);

        /* Polynomial approximation for log(1+x) */
        num_poly = @fmadd@(log_p5, x, log_p4);
        num_poly = @fmadd@(num_poly, x, log_p3);
        num_poly = @fmadd@(num_poly, x, log_p2);
        num_poly = @fmadd@(num_poly, x, log_p1);
        num_poly = @fmadd@(num_poly, x, log_p0);
        denom_poly = @fmadd@(log_q5, x, log_q4);
        denom_poly = @fmadd@(denom_poly, x, log_q3);
        denom_poly = @fmadd@(denom_poly, x, log_q2);
        denom_poly = @fmadd@(denom_poly, x, log_q1);
        denom_poly = @fmadd@(denom_poly, x, log_q0);
        poly = _mm@vsize@_div_ps(num_poly, denom_poly);
        poly = @fmadd@(exponent, loge2, poly);

        /*
         * x < 0.0f; return -NAN
         * x = +/- NAN; return NAN
         * x = 0.0f; return -INF
         */
        poly = @isa@_set_masked_lanes_ps(poly, nan, nan_mask);
        poly = @isa@_set_masked_lanes_ps(poly, neg_nan, negx_mask);
        poly = @isa@_set_masked_lanes_ps(poly, neg_inf, zero_mask);
        poly = @isa@_set_masked_lanes_ps(poly, inf, inf_mask);

        @masked_store@(op, @cvtps_epi32@(load_mask), poly);

        ip += num_lanes*stride;
        op += num_lanes;
        num_remaining_elements -= num_lanes;
    }

    if (@mask_to_int@(invalid_mask)) {
        npy_set_floatstatus_invalid();
    }
    if (@mask_to_int@(divide_by_zero_mask)) {
        npy_set_floatstatus_divbyzero();
    }
}
#endif
/**end repeat**/

/**begin repeat
 * #TYPE = CFLOAT, CDOUBLE#
 * #type = npy_float, npy_double#
 * #num_lanes = 16, 8#
 * #vsuffix = ps, pd#
 * #epi_vsub  = epi32, epi64#
 * #mask = __mmask16, __mmask8#
 * #vtype = __m512, __m512d#
 * #scale = 4, 8#
 * #vindextype = __m512i, __m256i#
 * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
 * #storemask = 0xFF, 0xF#
 * #IS_FLOAT = 1, 0#
 */

/**begin repeat1
 *  #func = add, subtract, multiply#
 *  #vectorf = _mm512_add, _mm512_sub, avx512_cmul#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps)
{
    const npy_intp array_size = dimensions[0];
    npy_intp num_remaining_elements = 2*array_size;
    @type@* ip1 = (@type@*) args[0];
    @type@* ip2 = (@type@*) args[1];
    @type@* op  = (@type@*) args[2];

    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < @num_lanes@) {
            load_mask = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements, @num_lanes@);
        }
        @vtype@ x1, x2;
        x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
        x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);

        @vtype@ out = @vectorf@_@vsuffix@(x1, x2);

        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);

        ip1 += @num_lanes@;
        ip2 += @num_lanes@;
        op += @num_lanes@;
        num_remaining_elements -= @num_lanes@;
    }
}
#endif
/**end repeat1**/

/**begin repeat1
 *  #func = square, conjugate#
 *  #vectorf = avx512_csquare, avx512_conjugate#
 */

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_@func@_@TYPE@(@type@ * op,
                      @type@ * ip,
                      const npy_intp array_size,
                      const npy_intp steps)
{
    npy_intp num_remaining_elements = 2*array_size;
    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;

     /*
      * Note: while generally indices are npy_intp, we ensure that our maximum index
      * will fit in an int32 as a precondition for this function via max_stride
      */
    npy_int32 index_ip1[16];
    for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) {
        index_ip1[ii] = ii*stride_ip1;
        index_ip1[ii+1] = ii*stride_ip1 + 1;
    }
    @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
    @vtype@ zeros = _mm512_setzero_@vsuffix@();

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < @num_lanes@) {
            load_mask = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements, @num_lanes@);
        }
        @vtype@ x1;
        if (stride_ip1 == 1) {
            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
        }
        else {
            x1  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask);
        }

        @vtype@ out = @vectorf@_@vsuffix@(x1);

        _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
        op += @num_lanes@;
        ip += @num_lanes@*stride_ip1;
        num_remaining_elements -= @num_lanes@;
    }
}
#endif
/**end repeat1**/

#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
AVX512F_absolute_@TYPE@(@type@ * op,
                        @type@ * ip,
                        const npy_intp array_size,
                        const npy_intp steps)
{
    npy_intp num_remaining_elements = 2*array_size;
    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;

    /*
     * Note: while generally indices are npy_intp, we ensure that our maximum index
     * will fit in an int32 as a precondition for this function via max_stride
     */
    npy_int32 index_ip[32];
    for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) {
        index_ip[ii] = ii*stride_ip1;
        index_ip[ii+1] = ii*stride_ip1 + 1;
    }
    @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip);
    @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@));

    @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
    @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@();
    @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@();
    @vtype@ zeros = _mm512_setzero_@vsuffix@();

#if @IS_FLOAT@
    __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0);
    __m512i im_index  = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1);
#else
    __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0);
    __m512i im_index  = _mm512_set_epi64(15,13,11,9,7,5,3,1);
#endif

    while (num_remaining_elements > 0) {
        if (num_remaining_elements < @num_lanes@) {
            load_mask1 = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements, @num_lanes@);
            load_mask2 = 0x0000;
            store_mask = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements/2, @num_lanes@);
        } else if (num_remaining_elements < 2*@num_lanes@) {
            load_mask1 = avx512_get_full_load_mask_@vsuffix@();
            load_mask2 = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements - @num_lanes@, @num_lanes@);
            store_mask = avx512_get_partial_load_mask_@vsuffix@(
                                    num_remaining_elements/2, @num_lanes@);
        }
        @vtype@ x1, x2;
        if (stride_ip1 == 1) {
            x1 = avx512_masked_load_@vsuffix@(load_mask1, ip);
            x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@);
        }
        else {
            x1  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1);
            x2  = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2);
        }

        @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index);

        _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
        op += @num_lanes@;
        ip += 2*@num_lanes@*stride_ip1;
        num_remaining_elements -= 2*@num_lanes@;
    }
    npy_clear_floatstatus_barrier((char*)op);
}

#endif
/**end repeat**/

/*
 *****************************************************************************
 **                           BOOL LOOPS
 *****************************************************************************
 */

/**begin repeat
 * # kind = logical_or, logical_and#
 * # and = 0, 1#
 * # op = ||, &&#
 * # sc = !=, ==#
 * # vpre = _mm*2#
 * # vsuf = si128*2#
 * # vtype = __m128i*2#
 * # type = npy_bool*2#
 * # vload = _mm_load_si128*2#
 * # vloadu = _mm_loadu_si128*2#
 * # vstore = _mm_store_si128*2#
 */

/*
 * convert any bit set to boolean true so vectorized and normal operations are
 * consistent, should not be required if bool is used correctly everywhere but
 * you never know
 */
#if !@and@
static NPY_INLINE @vtype@ byte_to_true(@vtype@ v)
{
    const @vtype@ zero = @vpre@_setzero_@vsuf@();
    const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
    /* get 0xFF for zeros */
    @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero);
    /* filled with 0xFF/0x00, negate and mask to boolean true */
    return @vpre@_andnot_@vsuf@(tmp, truemask);
}
#endif

static void
sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
        op[i] = ip1[i] @op@ ip2[i];
    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
        @vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
        @vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
#if @and@
        const @vtype@ zero = @vpre@_setzero_@vsuf@();
        /* get 0xFF for non zeros*/
        @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero);
        /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */
        tmp = @vpre@_andnot_@vsuf@(tmp, b);
#else
        @vtype@ tmp = @vpre@_or_@vsuf@(a, b);
#endif

        @vstore@((@vtype@*)&op[i], byte_to_true(tmp));
    }
    LOOP_BLOCKED_END {
        op[i] = (ip1[i] @op@ ip2[i]);
    }
}


static void
sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
{
    const @vtype@ zero = @vpre@_setzero_@vsuf@();
    LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) {
        *op = *op @op@ ip[i];
        if (*op @sc@ 0) {
            return;
        }
    }
    /* unrolled once to replace a slow movmsk with a fast pmaxb */
    LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) {
        @vtype@ v = @vload@((@vtype@*)&ip[i]);
        @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]);
        v = @vpre@_cmpeq_epi8(v, zero);
        v2 = @vpre@_cmpeq_epi8(v2, zero);
#if @and@
        if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
            *op = 0;
#else
        if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
            *op = 1;
#endif
            return;
        }
    }
    LOOP_BLOCKED_END {
        *op = *op @op@ ip[i];
        if (*op @sc@ 0) {
            return;
        }
    }
}

/**end repeat**/

/**begin repeat
 * # kind = absolute, logical_not#
 * # op = !=, ==#
 * # not = 0, 1#
 * # vpre = _mm*2#
 * # vsuf = si128*2#
 * # vtype = __m128i*2#
 * # type = npy_bool*2#
 * # vloadu = _mm_loadu_si128*2#
 * # vstore = _mm_store_si128*2#
 */

static void
sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
{
    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
        op[i] = (ip[i] @op@ 0);
    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
        @vtype@ a = @vloadu@((@vtype@*)&ip[i]);
#if @not@
        const @vtype@ zero = @vpre@_setzero_@vsuf@();
        const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
        /* equivalent to byte_to_true but can skip the negation */
        a = @vpre@_cmpeq_epi8(a, zero);
        a = @vpre@_and_@vsuf@(a, truemask);
#else
        /* abs is kind of pointless but maybe its used for byte_to_true */
        a = byte_to_true(a);
#endif
        @vstore@((@vtype@*)&op[i], a);
    }
    LOOP_BLOCKED_END {
        op[i] = (ip[i] @op@ 0);
    }
}

/**end repeat**/

#undef VECTOR_SIZE_BYTES

#endif /* NPY_HAVE_SSE2_INTRINSICS */

#endif
